# -*- coding: utf-8 -*-
# time: 2025/5/14 11:48
# file: pdf_01.py
# author: hanson
"""
https://www.langchain.com.cn/docs/how_to/document_loader_pdf/
"""
import re

from langchain_community.document_loaders import PyPDFLoader

file_path = r"C:\Users\Administrator\Downloads\26897537.pdf"

# Compiled once at import time so repeated clean_text() calls reuse them.
_TAG_RE = re.compile(r'<[^>]*>')
# ``\s`` already covers newlines and tabs, so no explicit ``\n\t`` is needed.
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Return *text* with HTML-like tags removed and whitespace collapsed.

    Everything from a ``<`` up to the next ``>`` is dropped, then every run
    of whitespace (newlines, tabs, spaces) is replaced by a single space and
    the result is stripped at both ends.

    >>> clean_text("<p>Hello\\n\\tworld</p>  ")
    'Hello world'
    """
    without_tags = _TAG_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', without_tags).strip()


def main(path=file_path):
    """Load the PDF at *path* and print its first page.

    Prints the first page's metadata followed by a blank line, then the
    page text after it has been passed through :func:`clean_text`.

    Raises:
        SystemExit: if the loader returns no pages.
    """
    # load() already returns a list of documents; no manual copy is needed.
    pages = PyPDFLoader(path).load()
    if not pages:
        # Indexing pages[0] on an empty result would raise a bare IndexError.
        raise SystemExit(f"No pages could be loaded from {path!r}")

    first_page = pages[0]
    print(f"{first_page.metadata}\n")
    print(clean_text(first_page.page_content))


if __name__ == "__main__":
    main()
